{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import xgboost as xgb\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn import metrics\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "import seaborn as sns\n",
    "import xgboost as xgb\n",
    "from pandas import read_csv\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.ensemble import VotingClassifier, BaggingClassifier, \\\n",
    "    AdaBoostClassifier, GradientBoostingClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = pd.read_csv('../input/application_train.csv')\n",
    "test_df = pd.read_csv('../input/application_test.csv')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Total</th>\n",
       "      <th>Percent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>COMMONAREA_MEDI</th>\n",
       "      <td>214865</td>\n",
       "      <td>0.698723</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>COMMONAREA_AVG</th>\n",
       "      <td>214865</td>\n",
       "      <td>0.698723</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>COMMONAREA_MODE</th>\n",
       "      <td>214865</td>\n",
       "      <td>0.698723</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NONLIVINGAPARTMENTS_MODE</th>\n",
       "      <td>213514</td>\n",
       "      <td>0.694330</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NONLIVINGAPARTMENTS_MEDI</th>\n",
       "      <td>213514</td>\n",
       "      <td>0.694330</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NONLIVINGAPARTMENTS_AVG</th>\n",
       "      <td>213514</td>\n",
       "      <td>0.694330</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FONDKAPREMONT_MODE</th>\n",
       "      <td>210295</td>\n",
       "      <td>0.683862</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LIVINGAPARTMENTS_MEDI</th>\n",
       "      <td>210199</td>\n",
       "      <td>0.683550</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LIVINGAPARTMENTS_MODE</th>\n",
       "      <td>210199</td>\n",
       "      <td>0.683550</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LIVINGAPARTMENTS_AVG</th>\n",
       "      <td>210199</td>\n",
       "      <td>0.683550</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FLOORSMIN_MEDI</th>\n",
       "      <td>208642</td>\n",
       "      <td>0.678486</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FLOORSMIN_MODE</th>\n",
       "      <td>208642</td>\n",
       "      <td>0.678486</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FLOORSMIN_AVG</th>\n",
       "      <td>208642</td>\n",
       "      <td>0.678486</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YEARS_BUILD_MEDI</th>\n",
       "      <td>204488</td>\n",
       "      <td>0.664978</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YEARS_BUILD_AVG</th>\n",
       "      <td>204488</td>\n",
       "      <td>0.664978</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>YEARS_BUILD_MODE</th>\n",
       "      <td>204488</td>\n",
       "      <td>0.664978</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OWN_CAR_AGE</th>\n",
       "      <td>202929</td>\n",
       "      <td>0.659908</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LANDAREA_MODE</th>\n",
       "      <td>182590</td>\n",
       "      <td>0.593767</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LANDAREA_AVG</th>\n",
       "      <td>182590</td>\n",
       "      <td>0.593767</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LANDAREA_MEDI</th>\n",
       "      <td>182590</td>\n",
       "      <td>0.593767</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                           Total   Percent\n",
       "COMMONAREA_MEDI           214865  0.698723\n",
       "COMMONAREA_AVG            214865  0.698723\n",
       "COMMONAREA_MODE           214865  0.698723\n",
       "NONLIVINGAPARTMENTS_MODE  213514  0.694330\n",
       "NONLIVINGAPARTMENTS_MEDI  213514  0.694330\n",
       "NONLIVINGAPARTMENTS_AVG   213514  0.694330\n",
       "FONDKAPREMONT_MODE        210295  0.683862\n",
       "LIVINGAPARTMENTS_MEDI     210199  0.683550\n",
       "LIVINGAPARTMENTS_MODE     210199  0.683550\n",
       "LIVINGAPARTMENTS_AVG      210199  0.683550\n",
       "FLOORSMIN_MEDI            208642  0.678486\n",
       "FLOORSMIN_MODE            208642  0.678486\n",
       "FLOORSMIN_AVG             208642  0.678486\n",
       "YEARS_BUILD_MEDI          204488  0.664978\n",
       "YEARS_BUILD_AVG           204488  0.664978\n",
       "YEARS_BUILD_MODE          204488  0.664978\n",
       "OWN_CAR_AGE               202929  0.659908\n",
       "LANDAREA_MODE             182590  0.593767\n",
       "LANDAREA_AVG              182590  0.593767\n",
       "LANDAREA_MEDI             182590  0.593767"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "total = train_df.isnull().sum().sort_values(ascending=False)\n",
    "percent = (train_df.isnull().sum()/train_df.isnull().count()).sort_values(ascending=False)\n",
    "missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])\n",
    "missing_data.head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SK_ID_CURR</th>\n",
       "      <th>TARGET</th>\n",
       "      <th>NAME_CONTRACT_TYPE</th>\n",
       "      <th>CODE_GENDER</th>\n",
       "      <th>FLAG_OWN_CAR</th>\n",
       "      <th>FLAG_OWN_REALTY</th>\n",
       "      <th>CNT_CHILDREN</th>\n",
       "      <th>AMT_INCOME_TOTAL</th>\n",
       "      <th>AMT_CREDIT</th>\n",
       "      <th>AMT_ANNUITY</th>\n",
       "      <th>...</th>\n",
       "      <th>FLAG_DOCUMENT_18</th>\n",
       "      <th>FLAG_DOCUMENT_19</th>\n",
       "      <th>FLAG_DOCUMENT_20</th>\n",
       "      <th>FLAG_DOCUMENT_21</th>\n",
       "      <th>AMT_REQ_CREDIT_BUREAU_HOUR</th>\n",
       "      <th>AMT_REQ_CREDIT_BUREAU_DAY</th>\n",
       "      <th>AMT_REQ_CREDIT_BUREAU_WEEK</th>\n",
       "      <th>AMT_REQ_CREDIT_BUREAU_MON</th>\n",
       "      <th>AMT_REQ_CREDIT_BUREAU_QRT</th>\n",
       "      <th>AMT_REQ_CREDIT_BUREAU_YEAR</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100002</td>\n",
       "      <td>1</td>\n",
       "      <td>Cash loans</td>\n",
       "      <td>M</td>\n",
       "      <td>N</td>\n",
       "      <td>Y</td>\n",
       "      <td>0</td>\n",
       "      <td>202500.0</td>\n",
       "      <td>406597.5</td>\n",
       "      <td>24700.5</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>100003</td>\n",
       "      <td>0</td>\n",
       "      <td>Cash loans</td>\n",
       "      <td>F</td>\n",
       "      <td>N</td>\n",
       "      <td>N</td>\n",
       "      <td>0</td>\n",
       "      <td>270000.0</td>\n",
       "      <td>1293502.5</td>\n",
       "      <td>35698.5</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>100004</td>\n",
       "      <td>0</td>\n",
       "      <td>Revolving loans</td>\n",
       "      <td>M</td>\n",
       "      <td>Y</td>\n",
       "      <td>Y</td>\n",
       "      <td>0</td>\n",
       "      <td>67500.0</td>\n",
       "      <td>135000.0</td>\n",
       "      <td>6750.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>100006</td>\n",
       "      <td>0</td>\n",
       "      <td>Cash loans</td>\n",
       "      <td>F</td>\n",
       "      <td>N</td>\n",
       "      <td>Y</td>\n",
       "      <td>0</td>\n",
       "      <td>135000.0</td>\n",
       "      <td>312682.5</td>\n",
       "      <td>29686.5</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>100007</td>\n",
       "      <td>0</td>\n",
       "      <td>Cash loans</td>\n",
       "      <td>M</td>\n",
       "      <td>N</td>\n",
       "      <td>Y</td>\n",
       "      <td>0</td>\n",
       "      <td>121500.0</td>\n",
       "      <td>513000.0</td>\n",
       "      <td>21865.5</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 122 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \\\n",
       "0      100002       1         Cash loans           M            N   \n",
       "1      100003       0         Cash loans           F            N   \n",
       "2      100004       0    Revolving loans           M            Y   \n",
       "3      100006       0         Cash loans           F            N   \n",
       "4      100007       0         Cash loans           M            N   \n",
       "\n",
       "  FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \\\n",
       "0               Y             0          202500.0    406597.5      24700.5   \n",
       "1               N             0          270000.0   1293502.5      35698.5   \n",
       "2               Y             0           67500.0    135000.0       6750.0   \n",
       "3               Y             0          135000.0    312682.5      29686.5   \n",
       "4               Y             0          121500.0    513000.0      21865.5   \n",
       "\n",
       "              ...              FLAG_DOCUMENT_18 FLAG_DOCUMENT_19  \\\n",
       "0             ...                             0                0   \n",
       "1             ...                             0                0   \n",
       "2             ...                             0                0   \n",
       "3             ...                             0                0   \n",
       "4             ...                             0                0   \n",
       "\n",
       "  FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR  \\\n",
       "0                0                0                        0.0   \n",
       "1                0                0                        0.0   \n",
       "2                0                0                        0.0   \n",
       "3                0                0                        0.0   \n",
       "4                0                0                        0.0   \n",
       "\n",
       "  AMT_REQ_CREDIT_BUREAU_DAY  AMT_REQ_CREDIT_BUREAU_WEEK  \\\n",
       "0                       0.0                         0.0   \n",
       "1                       0.0                         0.0   \n",
       "2                       0.0                         0.0   \n",
       "3                       0.0                         0.0   \n",
       "4                       0.0                         0.0   \n",
       "\n",
       "   AMT_REQ_CREDIT_BUREAU_MON  AMT_REQ_CREDIT_BUREAU_QRT  \\\n",
       "0                        0.0                        0.0   \n",
       "1                        0.0                        0.0   \n",
       "2                        0.0                        0.0   \n",
       "3                        0.0                        0.0   \n",
       "4                        0.0                        0.0   \n",
       "\n",
       "   AMT_REQ_CREDIT_BUREAU_YEAR  \n",
       "0                         1.0  \n",
       "1                         0.0  \n",
       "2                         0.0  \n",
       "3                         0.0  \n",
       "4                         0.0  \n",
       "\n",
       "[5 rows x 122 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df = train_df.apply(lambda x:x.fillna(x.value_counts().index[0]))\n",
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = train_df.drop(['TARGET'],axis =1)\n",
    "y_train = train_df['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = pd.get_dummies(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Total</th>\n",
       "      <th>Percent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>COMMONAREA_MEDI</th>\n",
       "      <td>33495</td>\n",
       "      <td>0.687161</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>COMMONAREA_AVG</th>\n",
       "      <td>33495</td>\n",
       "      <td>0.687161</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>COMMONAREA_MODE</th>\n",
       "      <td>33495</td>\n",
       "      <td>0.687161</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NONLIVINGAPARTMENTS_MODE</th>\n",
       "      <td>33347</td>\n",
       "      <td>0.684125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NONLIVINGAPARTMENTS_MEDI</th>\n",
       "      <td>33347</td>\n",
       "      <td>0.684125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NONLIVINGAPARTMENTS_AVG</th>\n",
       "      <td>33347</td>\n",
       "      <td>0.684125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FONDKAPREMONT_MODE</th>\n",
       "      <td>32797</td>\n",
       "      <td>0.672842</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LIVINGAPARTMENTS_AVG</th>\n",
       "      <td>32780</td>\n",
       "      <td>0.672493</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LIVINGAPARTMENTS_MEDI</th>\n",
       "      <td>32780</td>\n",
       "      <td>0.672493</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LIVINGAPARTMENTS_MODE</th>\n",
       "      <td>32780</td>\n",
       "      <td>0.672493</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                          Total   Percent\n",
       "COMMONAREA_MEDI           33495  0.687161\n",
       "COMMONAREA_AVG            33495  0.687161\n",
       "COMMONAREA_MODE           33495  0.687161\n",
       "NONLIVINGAPARTMENTS_MODE  33347  0.684125\n",
       "NONLIVINGAPARTMENTS_MEDI  33347  0.684125\n",
       "NONLIVINGAPARTMENTS_AVG   33347  0.684125\n",
       "FONDKAPREMONT_MODE        32797  0.672842\n",
       "LIVINGAPARTMENTS_AVG      32780  0.672493\n",
       "LIVINGAPARTMENTS_MEDI     32780  0.672493\n",
       "LIVINGAPARTMENTS_MODE     32780  0.672493"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_total = test_df.isnull().sum().sort_values(ascending=False)\n",
    "test_percent = (test_df.isnull().sum()/test_df.isnull().count()).sort_values(ascending=False)\n",
    "test_missing_data = pd.concat([test_total, test_percent], axis=1, keys=['Total', 'Percent'])\n",
    "test_missing_data.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df = test_df.apply(lambda x:x.fillna(x.value_counts().index[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SK_ID_CURR</th>\n",
       "      <th>CNT_CHILDREN</th>\n",
       "      <th>AMT_INCOME_TOTAL</th>\n",
       "      <th>AMT_CREDIT</th>\n",
       "      <th>AMT_ANNUITY</th>\n",
       "      <th>AMT_GOODS_PRICE</th>\n",
       "      <th>REGION_POPULATION_RELATIVE</th>\n",
       "      <th>DAYS_BIRTH</th>\n",
       "      <th>DAYS_EMPLOYED</th>\n",
       "      <th>DAYS_REGISTRATION</th>\n",
       "      <th>...</th>\n",
       "      <th>HOUSETYPE_MODE_terraced house</th>\n",
       "      <th>WALLSMATERIAL_MODE_Block</th>\n",
       "      <th>WALLSMATERIAL_MODE_Mixed</th>\n",
       "      <th>WALLSMATERIAL_MODE_Monolithic</th>\n",
       "      <th>WALLSMATERIAL_MODE_Others</th>\n",
       "      <th>WALLSMATERIAL_MODE_Panel</th>\n",
       "      <th>WALLSMATERIAL_MODE_Stone, brick</th>\n",
       "      <th>WALLSMATERIAL_MODE_Wooden</th>\n",
       "      <th>EMERGENCYSTATE_MODE_No</th>\n",
       "      <th>EMERGENCYSTATE_MODE_Yes</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100001</td>\n",
       "      <td>0</td>\n",
       "      <td>135000.0</td>\n",
       "      <td>568800.0</td>\n",
       "      <td>20560.5</td>\n",
       "      <td>450000.0</td>\n",
       "      <td>0.018850</td>\n",
       "      <td>-19241</td>\n",
       "      <td>-2329</td>\n",
       "      <td>-5170.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>100005</td>\n",
       "      <td>0</td>\n",
       "      <td>99000.0</td>\n",
       "      <td>222768.0</td>\n",
       "      <td>17370.0</td>\n",
       "      <td>180000.0</td>\n",
       "      <td>0.035792</td>\n",
       "      <td>-18064</td>\n",
       "      <td>-4469</td>\n",
       "      <td>-9118.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>100013</td>\n",
       "      <td>0</td>\n",
       "      <td>202500.0</td>\n",
       "      <td>663264.0</td>\n",
       "      <td>69777.0</td>\n",
       "      <td>630000.0</td>\n",
       "      <td>0.019101</td>\n",
       "      <td>-20038</td>\n",
       "      <td>-4458</td>\n",
       "      <td>-2175.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>100028</td>\n",
       "      <td>2</td>\n",
       "      <td>315000.0</td>\n",
       "      <td>1575000.0</td>\n",
       "      <td>49018.5</td>\n",
       "      <td>1575000.0</td>\n",
       "      <td>0.026392</td>\n",
       "      <td>-13976</td>\n",
       "      <td>-1866</td>\n",
       "      <td>-2000.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>100038</td>\n",
       "      <td>1</td>\n",
       "      <td>180000.0</td>\n",
       "      <td>625500.0</td>\n",
       "      <td>32067.0</td>\n",
       "      <td>625500.0</td>\n",
       "      <td>0.010032</td>\n",
       "      <td>-13040</td>\n",
       "      <td>-2191</td>\n",
       "      <td>-4000.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 242 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   SK_ID_CURR  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \\\n",
       "0      100001             0          135000.0    568800.0      20560.5   \n",
       "1      100005             0           99000.0    222768.0      17370.0   \n",
       "2      100013             0          202500.0    663264.0      69777.0   \n",
       "3      100028             2          315000.0   1575000.0      49018.5   \n",
       "4      100038             1          180000.0    625500.0      32067.0   \n",
       "\n",
       "   AMT_GOODS_PRICE  REGION_POPULATION_RELATIVE  DAYS_BIRTH  DAYS_EMPLOYED  \\\n",
       "0         450000.0                    0.018850      -19241          -2329   \n",
       "1         180000.0                    0.035792      -18064          -4469   \n",
       "2         630000.0                    0.019101      -20038          -4458   \n",
       "3        1575000.0                    0.026392      -13976          -1866   \n",
       "4         625500.0                    0.010032      -13040          -2191   \n",
       "\n",
       "   DAYS_REGISTRATION           ...             HOUSETYPE_MODE_terraced house  \\\n",
       "0            -5170.0           ...                                         0   \n",
       "1            -9118.0           ...                                         0   \n",
       "2            -2175.0           ...                                         0   \n",
       "3            -2000.0           ...                                         0   \n",
       "4            -4000.0           ...                                         0   \n",
       "\n",
       "   WALLSMATERIAL_MODE_Block  WALLSMATERIAL_MODE_Mixed  \\\n",
       "0                         0                         0   \n",
       "1                         0                         0   \n",
       "2                         0                         0   \n",
       "3                         0                         0   \n",
       "4                         0                         0   \n",
       "\n",
       "   WALLSMATERIAL_MODE_Monolithic  WALLSMATERIAL_MODE_Others  \\\n",
       "0                              0                          0   \n",
       "1                              0                          0   \n",
       "2                              0                          0   \n",
       "3                              0                          0   \n",
       "4                              0                          0   \n",
       "\n",
       "   WALLSMATERIAL_MODE_Panel  WALLSMATERIAL_MODE_Stone, brick  \\\n",
       "0                         0                                1   \n",
       "1                         1                                0   \n",
       "2                         1                                0   \n",
       "3                         1                                0   \n",
       "4                         1                                0   \n",
       "\n",
       "   WALLSMATERIAL_MODE_Wooden  EMERGENCYSTATE_MODE_No  EMERGENCYSTATE_MODE_Yes  \n",
       "0                          0                       1                        0  \n",
       "1                          0                       1                        0  \n",
       "2                          0                       1                        0  \n",
       "3                          0                       1                        0  \n",
       "4                          0                       1                        0  \n",
       "\n",
       "[5 rows x 242 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_test = pd.get_dummies(test_df)\n",
    "X_test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "dtrain = xgb.DMatrix(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "dtest = xgb.DMatrix(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb_params = {\n",
    "    'eta': 0.05,\n",
    "    'max_depth': 5,\n",
    "    'subsample': 0.7,\n",
    "    'colsample_bytree': 0.7,\n",
    "    'objective': 'reg:linear',\n",
    "    'eval_metric': 'rmse',\n",
    "    'silent': 1\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\ttrain-rmse:0.482144\ttest-rmse:0.482166\n"
     ]
    }
   ],
   "source": [
    "cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=1000, early_stopping_rounds=20,\n",
    "    verbose_eval=50, show_stdv=False)\n",
    "cv_output[['train-rmse-mean', 'test-rmse-mean']].plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_boost_rounds = len(cv_output)\n",
    "model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round= num_boost_rounds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_predict = model.predict(dtest)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output = pd.DataFrame({ 'TARGET': y_predict})\n",
    "output.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output.to_csv('submission_data_credicardOK.csv',index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
